Document Summary

1. Data Exploration

2. Data Cleaning

3. Data Pre-processing

4. Feature Selection

Data Exploration

# Pin the CRAN mirror so package installation works non-interactively.
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install corrplot only when it is missing, so re-running or re-knitting the
# document does not download the package again every time.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
## 
## The downloaded binary packages are in
##  /var/folders/fh/hhgmlzcn0p90_g2xb9ctfx680000gn/T//RtmpvhMj4t/downloaded_packages
library(readr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.95 loaded
library(reshape2)
library(RColorBrewer)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(naniar)
library(missForest)
#install.packages(c("ggplot2", "reshape2"))
library(corrplot)

Read Data File

# Load the raw patient records from the local CSV export.
patients <- read.csv(
  file = "/Users/arnenyecknyeck/Desktop/Statistical-Inference-Package/patients_dirty_data.csv"
)

Examining data structure

# Preview the first six rows, then print per-column summary statistics.
head(patients, n = 6L)
summary(object = patients)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI           Pedigree           Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
##    Diagnosis    
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
# List the column names of the data frame.
names(patients)
## [1] "Pregnancies"   "Glucose"       "BloodPressure" "SkinThickness"
## [5] "Insulin"       "BMI"           "Pedigree"      "Age"          
## [9] "Diagnosis"
str(patients)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies  : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose      : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure: int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness: int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin      : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI          : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ Pedigree     : num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age          : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Diagnosis    : int  1 0 1 0 1 0 1 0 1 1 ...

Check missing values

# Count NA entries in each column.
missing_mask <- is.na(patients)
colSums(missing_mask)
##   Pregnancies       Glucose BloodPressure SkinThickness       Insulin 
##             0             0             0             0             0 
##           BMI      Pedigree           Age     Diagnosis 
##             0             0             0             0

Examining positively diagnosed records

# Records with a positive diagnosis (Diagnosis == 1); which() drops any NA
# comparisons, matching subset()'s treatment of missing conditions.
Positive <- patients[which(patients$Diagnosis == 1), ]
head(Positive, n = 5)

Visualizing Diagnosis

# Bar chart of the number of records per diagnosis value.
ggplot(data = patients, mapping = aes(x = factor(Diagnosis))) +
  geom_bar(colour = "black", fill = "skyblue") +
  theme_minimal() +
  labs(
    x = "Diagnosis (0 = No, 1 = Yes)",
    y = "Count",
    title = "Count of Gestational Diabetes Diagnoses"
  )

Checking for skewness

Blood Pressure

# Histogram of blood pressure in bins of width 10, shaded by bin count.
# after_stat(count) replaces the `..count..` notation, which was deprecated
# in ggplot2 3.4.0.
ggplot(patients, aes(x = BloodPressure, fill = after_stat(count))) +
  geom_histogram(binwidth = 10, color = "black") +
  scale_fill_viridis_c() +
  labs(title = "Histogram of Blood Pressure",
       x = "Blood Pressure",
       y = "Count") +
  theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

We can say that blood pressure is skewed left by looking at the graph.

Skin Thickness

# Histogram of skin thickness in bins of width 10, shaded by bin count.
# after_stat(count) replaces the `..count..` notation, which was deprecated
# in ggplot2 3.4.0.
ggplot(patients, aes(x = SkinThickness, fill = after_stat(count))) +
  geom_histogram(binwidth = 10, color = "black") +
  scale_fill_viridis_c() +
  labs(title = "Skin Thickness Histogram",
       x = "Skin Thickness",
       y = "Count") +
  theme_minimal()

We can say that skin thickness is skewed left by looking at the graph.

Insulin

# Histogram of insulin in bins of width 10, shaded by bin count.
# after_stat(count) replaces the `..count..` notation, which was deprecated
# in ggplot2 3.4.0.
ggplot(patients, aes(x = Insulin, fill = after_stat(count))) +
  geom_histogram(binwidth = 10, color = "black") +
  scale_fill_viridis_c() +
  labs(title = "Insulin Histogram",
       x = "Insulin",
       y = "Count") +
  theme_minimal()

We can say that Insulin is skewed right by looking at the graph (the mean, 79.8, is well above the median, 30.5).

BMI

# Histogram of BMI in bins of width 2, shaded by bin count.
# after_stat(count) replaces the `..count..` notation, which was deprecated
# in ggplot2 3.4.0.
ggplot(patients, aes(x = BMI, fill = after_stat(count))) +
  geom_histogram(binwidth = 2, color = "black") +
  scale_fill_viridis_c() +
  labs(title = "Histogram of BMI",
       x = "BMI",
       y = "Count") +
  theme_minimal()

There is an observed concentration between 20-40

Diagnosis and Glucose

# Side-by-side boxplots of glucose for each diagnosis group.
ggplot(
  data = patients,
  mapping = aes(x = factor(Diagnosis), y = Glucose, fill = factor(Diagnosis))
) +
  geom_boxplot(colour = "black") +
  scale_fill_manual(
    labels = c("No GDM", "GDM"),
    values = c("0" = "#56B4E9", "1" = "#E69F00")
  ) +
  theme_minimal() +
  labs(
    fill = "Diagnosis",
    x = "Gestational Diabetes Diagnosis",
    y = "Glucose (mg/dL)",
    title = "Glucose Levels by Diagnosis"
  )

Patients diagnosed with GDM have higher Glucose levels.

Plot variables with each other

Blood pressure and glucose

# Glucose against blood pressure, restricted to the positive-diagnosis subset.
ggplot(data = Positive, mapping = aes(x = BloodPressure, y = Glucose)) +
  geom_point(colour = "blue") +
  theme_minimal() +
  labs(
    x = "BloodPressure",
    y = "Glucose",
    title = "BloodPressure & Glucose"
  )

Age and Glucose

# Glucose against age; points coloured by diagnosis (0 = blue, 1 = red).
ggplot(
  data = patients,
  mapping = aes(x = Age, y = Glucose, colour = as.factor(Diagnosis))
) +
  geom_point(alpha = 0.7, size = 2) +
  scale_colour_manual(
    name = "Diagnosis",
    labels = c("Negative", "Positive"),
    values = c("0" = "blue", "1" = "red")
  ) +
  theme_minimal() +
  labs(
    x = "Age",
    y = "Glucose",
    title = "Glucose vs Age Colored by Diagnosis"
  )

BloodPressure and Age

# Blood pressure against age; points coloured by diagnosis (0 = blue, 1 = red).
ggplot(
  data = patients,
  mapping = aes(x = Age, y = BloodPressure, colour = as.factor(Diagnosis))
) +
  geom_point(alpha = 0.7, size = 2) +
  scale_colour_manual(
    name = "Diagnosis",
    labels = c("Negative", "Positive"),
    values = c("0" = "blue", "1" = "red")
  ) +
  theme_minimal() +
  labs(
    x = "Age",
    y = "BloodPressure",
    title = "BloodPressure vs Age Colored by Diagnosis"
  )

SkinThickness and Age

# Skin thickness against age; points coloured by diagnosis (0 = blue, 1 = red).
ggplot(
  data = patients,
  mapping = aes(x = Age, y = SkinThickness, colour = as.factor(Diagnosis))
) +
  geom_point(alpha = 0.7, size = 2) +
  scale_colour_manual(
    name = "Diagnosis",
    labels = c("Negative", "Positive"),
    values = c("0" = "blue", "1" = "red")
  ) +
  theme_minimal() +
  labs(
    x = "Age",
    y = "SkinThickness",
    title = "SkinThickness vs Age Colored by Diagnosis"
  )

Insulin and Age

# Insulin against age; points coloured by diagnosis (0 = blue, 1 = red).
ggplot(
  data = patients,
  mapping = aes(x = Age, y = Insulin, colour = as.factor(Diagnosis))
) +
  geom_point(alpha = 0.7, size = 2) +
  scale_colour_manual(
    name = "Diagnosis",
    labels = c("Negative", "Positive"),
    values = c("0" = "blue", "1" = "red")
  ) +
  theme_minimal() +
  labs(
    x = "Age",
    y = "Insulin",
    title = "Insulin vs Age Colored by Diagnosis"
  )

BMI and Age

# BMI against age; points coloured by diagnosis (0 = blue, 1 = red).
ggplot(
  data = patients,
  mapping = aes(x = Age, y = BMI, colour = as.factor(Diagnosis))
) +
  geom_point(alpha = 0.7, size = 2) +
  scale_colour_manual(
    name = "Diagnosis",
    labels = c("Negative", "Positive"),
    values = c("0" = "blue", "1" = "red")
  ) +
  theme_minimal() +
  labs(
    x = "Age",
    y = "BMI",
    title = "BMI vs Age Colored by Diagnosis"
  )

Checking correlation between features

# Pairwise correlation matrix over the numeric columns of `patients`.
# vapply() always returns a logical mask (sapply()'s return type depends on
# its input), and drop = FALSE keeps a data frame even if only one column
# is numeric.
cor_matrix <- cor(
  patients[, vapply(patients, is.numeric, logical(1)), drop = FALSE],
  use = "pairwise.complete.obs"
)
print(cor_matrix)
##               Pregnancies    Glucose BloodPressure SkinThickness     Insulin
## Pregnancies    1.00000000 0.12945867    0.14128198   -0.08167177 -0.07353461
## Glucose        0.12945867 1.00000000    0.15258959    0.05732789  0.33135711
## BloodPressure  0.14128198 0.15258959    1.00000000    0.20737054  0.08893338
## SkinThickness -0.08167177 0.05732789    0.20737054    1.00000000  0.43678257
## Insulin       -0.07353461 0.33135711    0.08893338    0.43678257  1.00000000
## BMI            0.01768309 0.22107107    0.28180529    0.39257320  0.19785906
## Pedigree      -0.03352267 0.13733730    0.04126495    0.18392757  0.18507093
## Age            0.54434123 0.26351432    0.23952795   -0.11397026 -0.04216295
## Diagnosis      0.22189815 0.46658140    0.06506836    0.07475223  0.13054795
##                      BMI    Pedigree         Age  Diagnosis
## Pregnancies   0.01768309 -0.03352267  0.54434123 0.22189815
## Glucose       0.22107107  0.13733730  0.26351432 0.46658140
## BloodPressure 0.28180529  0.04126495  0.23952795 0.06506836
## SkinThickness 0.39257320  0.18392757 -0.11397026 0.07475223
## Insulin       0.19785906  0.18507093 -0.04216295 0.13054795
## BMI           1.00000000  0.14064695  0.03624187 0.29269466
## Pedigree      0.14064695  1.00000000  0.03356131 0.17384407
## Age           0.03624187  0.03356131  1.00000000 0.23835598
## Diagnosis     0.29269466  0.17384407  0.23835598 1.00000000
# Keep only the numeric columns of `patients`. vapply() always returns a
# logical mask (sapply()'s return type depends on its input), and
# drop = FALSE keeps a data frame even if only one column is numeric.
patients_corr <- patients[, vapply(patients, is.numeric, logical(1)),
                          drop = FALSE]

# Pairwise correlation matrix of the numeric columns
cor_mat <- cor(patients_corr, use = "pairwise.complete.obs")

# Colour-coded heat map of the upper triangle, labels in black at 45 degrees
corrplot(cor_mat, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45)

Let’s add the correlation coefficients to the plot

# Interpolate the 9-colour "YlGnBu" Brewer palette into 100 shades
cols <- colorRampPalette(brewer.pal(n = 9, name = "YlGnBu"))(100)

# Upper-triangle heat map with each coefficient printed in black
corrplot::corrplot(
  corr = cor_matrix,
  method = "color",
  type = "upper",
  col = cols,
  addCoef.col = "black",
  tl.cex = 0.8
)

Data Cleaning

Replace 0s with NA in selected columns (where 0 indicates missing)

# Mark zeros as missing in the measurement columns. Pregnancies and Diagnosis
# are deliberately left alone because a zero is a meaningful value there.
is.na(patients$Insulin) <- patients$Insulin == 0
is.na(patients$BMI) <- patients$BMI == 0
is.na(patients$SkinThickness) <- patients$SkinThickness == 0
is.na(patients$BloodPressure) <- patients$BloodPressure == 0
is.na(patients$Pedigree) <- patients$Pedigree == 0
is.na(patients$Glucose) <- patients$Glucose == 0
is.na(patients$Age) <- patients$Age == 0

Explore NA values

#Count NAs
sum(is.na(patients))
## [1] 652
#652 NAs total
# Count rows that contain at least one NA. complete.cases() works on the data
# frame directly, avoiding the matrix coercion apply() performs on each call.
sum(!complete.cases(patients))
## [1] 376
#376 rows have a missing value
sum(patients == 0, na.rm = TRUE)
## [1] 611
#611 zeros in total 

Double checking that conversion was successful for each column

sum(patients$Age == 0, na.rm = TRUE)
## [1] 0
#All Age zeros converted to NA
sum(patients$SkinThickness == 0, na.rm = TRUE)
## [1] 0
#All SkinThickness zeros converted to NA
sum(patients$Insulin == 0, na.rm = TRUE)
## [1] 0
#All Insulin zeros have been converted to NA
sum(patients$BMI == 0, na.rm = TRUE)
## [1] 0
#All BMI zeros have been converted to NA
sum(patients$Pedigree == 0, na.rm = TRUE)
## [1] 0
#All pedigree zeros have been converted to NA
sum(patients$Glucose == 0, na.rm = TRUE)
## [1] 0
#All glucose zeros have been converted to NA
sum(patients$BloodPressure == 0, na.rm = TRUE)
## [1] 0
#All blood pressure zeros have been converted to NA
sum(patients$Diagnosis == 0, na.rm = TRUE)
## [1] 500
sum(patients$Diagnosis == 1, na.rm = TRUE)
## [1] 268

Diagnosis and Pregnancies zeros have been left as zeros: Diagnosis is binary (positive or negative), and zero is a valid number of pregnancies.

There are 268 positive diagnoses and 500 negative diagnoses in the dataset.

Checking Correlation

# Correlation matrix of the first nine columns.
# use = "complete.obs" drops every row that contains an NA before computing.
cor_matrix2 <- cor(x = patients[, seq_len(9)], use = "complete.obs")
print(cor_matrix2)
##                Pregnancies   Glucose BloodPressure SkinThickness    Insulin
## Pregnancies    1.000000000 0.1982910     0.2133548     0.0932094 0.07898363
## Glucose        0.198291043 1.0000000     0.2100266     0.1988558 0.58122301
## BloodPressure  0.213354775 0.2100266     1.0000000     0.2325712 0.09851150
## SkinThickness  0.093209397 0.1988558     0.2325712     1.0000000 0.18219906
## Insulin        0.078983625 0.5812230     0.0985115     0.1821991 1.00000000
## BMI           -0.025347276 0.2095159     0.3044034     0.6643549 0.22639652
## Pedigree       0.007562116 0.1401802    -0.0159711     0.1604985 0.13590578
## Age            0.679608470 0.3436415     0.3000389     0.1677611 0.21708199
## Diagnosis      0.256565956 0.5157027     0.1926733     0.2559357 0.30142922
##                       BMI     Pedigree        Age Diagnosis
## Pregnancies   -0.02534728  0.007562116 0.67960847 0.2565660
## Glucose        0.20951592  0.140180180 0.34364150 0.5157027
## BloodPressure  0.30440337 -0.015971104 0.30003895 0.1926733
## SkinThickness  0.66435487  0.160498526 0.16776114 0.2559357
## Insulin        0.22639652  0.135905781 0.21708199 0.3014292
## BMI            1.00000000  0.158771043 0.06981380 0.2701184
## Pedigree       0.15877104  1.000000000 0.08502911 0.2093295
## Age            0.06981380  0.085029106 1.00000000 0.3508038
## Diagnosis      0.27011841  0.209329511 0.35080380 1.0000000
corrplot(cor_matrix2, method = "circle")

#Highly correlated features: 
#Age & number of pregnancies- 0.68
#Glucose & Insulin- 0.58
#BMI & skin thickness- 0.66
#Diagnosis & glucose- 0.515
# VIF: regress Diagnosis on all eight predictors, then compute the variance
# inflation factor of each predictor with car::vif().
model <- lm(
  Diagnosis ~ BMI + Age + Pregnancies + Pedigree + Glucose +
    BloodPressure + Insulin + SkinThickness,
  data = patients
)

car::vif(model)
##           BMI           Age   Pregnancies      Pedigree       Glucose 
##      1.979596      2.129433      1.900719      1.059315      1.670072 
## BloodPressure       Insulin SkinThickness 
##      1.231815      1.556143      1.852772

Some features are correlated, but all VIF scores are below 3, which suggests there is no problematic multicollinearity.

Investigate missingness

# List every record that has at least one missing field
patients[rowSums(is.na(patients)) > 0, ]
# Plot the amount of missing data for each variable
gg_miss_var(x = patients)

# Plot where in the data set the missing values fall
vis_miss(x = patients)

NA & Zeros Totals

colSums(is.na(patients))
##   Pregnancies       Glucose BloodPressure SkinThickness       Insulin 
##             0             5            35           227           374 
##           BMI      Pedigree           Age     Diagnosis 
##            11             0             0             0
colSums(patients == 0, na.rm = TRUE)
##   Pregnancies       Glucose BloodPressure SkinThickness       Insulin 
##           111             0             0             0             0 
##           BMI      Pedigree           Age     Diagnosis 
##             0             0             0           500
#3 columns with <=35 missing rows
#111 rows where pregnancies == 0
#500 rows where Diagnosis == 0

Find rows where the number of missing values is 3 or more

# Number of missing (NA) values in each row. Despite the variable name, only
# NAs are counted here, not zeros. rowSums() on the logical NA mask replaces
# a row-wise apply() over the data frame.
zero_or_na_per_row <- rowSums(is.na(patients))

# Count rows with three or more missing values; the outer parentheses print it.
(rows_3_or_more <- sum(zero_or_na_per_row >= 3))
## [1] 35

We see that there are 35 rows with three or more missing values, in most cases blood pressure, skin thickness, and insulin missing together. We choose to remove these rows, which make up about 4.6% of the data (35 of 768 rows).

Remove rows with 3 or more NAs

# Drop every record that has three or more missing values
df_clean <- patients[!(rowSums(is.na(patients)) >= 3), ]

Visualize missingness together

gg_miss_upset(patients)

SkinThickness and Insulin are missing together 192 times. Possible explanations for the missingness: these measurements require more invasive testing, or they are only taken for certain clients or appointment types. This should be explored to see whether the data are MCAR, MAR, or MNAR.

1.

length(patients$Insulin)
## [1] 768
length(patients$SkinThickness)
## [1] 768
# Flag (1 / 0) the records where Insulin and SkinThickness are both missing
patients$insulin_skin_missing <- as.numeric(
  is.na(patients$Insulin) & is.na(patients$SkinThickness)
)

# Two-sample t-test of BMI between flagged and unflagged records
t_result <- t.test(
  BMI ~ insulin_skin_missing,
  data = patients
)
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  BMI by insulin_skin_missing
## t = 2.7363, df = 397.34, p-value = 0.006493
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.4284462 2.6153131
## sample estimates:
## mean in group 0 mean in group 1 
##        32.89573        31.37385
# Two-sample t-test of Pregnancies between flagged and unflagged records
t_result <- t.test(
  Pregnancies ~ insulin_skin_missing,
  data = patients
)
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  Pregnancies by insulin_skin_missing
## t = -4.2626, df = 421.29, p-value = 2.496e-05
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.646331 -0.607173
## sample estimates:
## mean in group 0 mean in group 1 
##        3.512015        4.638767
# Two-sample t-test of Age between flagged and unflagged records
t_result <- t.test(
  Age ~ insulin_skin_missing,
  data = patients
)
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  Age by insulin_skin_missing
## t = -5.795, df = 360.35, p-value = 1.492e-08
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -7.624804 -3.760948
## sample estimates:
## mean in group 0 mean in group 1 
##        31.55823        37.25110
# Two-sample t-test of Pedigree between flagged and unflagged records
t_result <- t.test(
  Pedigree ~ insulin_skin_missing,
  data = patients
)
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  Pedigree by insulin_skin_missing
## t = 4.7151, df = 525.84, p-value = 3.099e-06
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.0650795 0.1580396
## sample estimates:
## mean in group 0 mean in group 1 
##       0.5048503       0.3932907
# Two-sample t-test of Glucose between flagged and unflagged records
t_result <- t.test(
  Glucose ~ insulin_skin_missing,
  data = patients
)
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  Glucose by insulin_skin_missing
## t = -1.0567, df = 443.69, p-value = 0.2912
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -7.175451  2.157369
## sample estimates:
## mean in group 0 mean in group 1 
##        120.9403        123.4493
# Two-sample t-test of BloodPressure between flagged and unflagged records
t_result <- t.test(
  BloodPressure ~ insulin_skin_missing,
  data = patients
)
print(t_result)
## 
##  Welch Two Sample t-test
## 
## data:  BloodPressure by insulin_skin_missing
## t = -3.4606, df = 340.58, p-value = 0.0006075
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -5.589413 -1.538200
## sample estimates:
## mean in group 0 mean in group 1 
##        71.46197        75.02577

2.

# Age by missingness flag
ggplot(
  data = patients,
  mapping = aes(x = factor(insulin_skin_missing), y = Age)
) +
  geom_boxplot(fill = c("#a6cee3", "#1f78b4")) +
  theme_minimal() +
  labs(
    title = "Age Distribution by Missingness of Insulin and SkinThickness",
    x = "Insulin & SkinThickness Missing (0 = No, 1 = Yes)",
    y = "Age"
  )

# Number of pregnancies by missingness flag
ggplot(
  data = patients,
  mapping = aes(x = factor(insulin_skin_missing), y = Pregnancies)
) +
  geom_boxplot(fill = c("#fdbf6f", "#ff7f00")) +
  theme_minimal() +
  labs(
    title = "Pregnancies Distribution by Missingness of Insulin and SkinThickness",
    x = "Insulin & SkinThickness Missing (0 = No, 1 = Yes)",
    y = "Number of Pregnancies"
  )

# Pedigree by missingness flag
ggplot(
  data = patients,
  mapping = aes(x = factor(insulin_skin_missing), y = Pedigree)
) +
  geom_boxplot(fill = c("blue", "forestgreen")) +
  theme_minimal() +
  labs(
    title = "Pedigree Distribution by Missingness of Insulin and SkinThickness",
    x = "Insulin & SkinThickness Missing (0 = No, 1 = Yes)",
    y = "Pedigree"
  )

Analysis: There are statistically significant differences in the means of several features between records that are missing this pair of variables and records that are not. Upon researching the domain, we suspect that older patients, or patients who have been pregnant several times before, may often skip these measurements: they are not routine, and are possibly ordered only when other predictive factors are observed. Because the missingness appears to depend on other observed variables, we suspect that the high rates of missing skin thickness and insulin values are missing at random (MAR). Imputation methods should be chosen carefully with this in mind.

Are zeros in ‘Pregnancies’ errors? Or does this column ask about previous pregnancies, not including the current pregnancy?

# Records reporting zero pregnancies; which() ignores NA comparisons, as
# subset() does.
preg0_df <- patients[which(patients$Pregnancies == 0), ]

# How many of those records carry a positive diagnosis (Diagnosis == 1)
gdm_with_preg0 <- sum(preg0_df$Diagnosis == 1, na.rm = TRUE)

# Size of the zero-pregnancy group
total_preg0 <- nrow(preg0_df)

# Report the group size
cat("Total rows with Pregnancies == 0:", total_preg0, "\n")
## Total rows with Pregnancies == 0: 111
cat("Rows with Pregnancies == 0 AND Diagnosis == 1 (GDM):", gdm_with_preg0, "\n")
## Rows with Pregnancies == 0 AND Diagnosis == 1 (GDM): 38

There are 38 cases where Pregnancies == 0 and Diagnosis == 1. We will therefore assume that ‘Pregnancies’ counts previous pregnancies only, so that 0 means no previous pregnancies.

Check for duplicates

# TRUE when at least one row of df_clean repeats an earlier row.
anyDuplicated(df_clean) > 0
## [1] FALSE

Recheck NA’s

colSums(is.na(df_clean))
##   Pregnancies       Glucose BloodPressure SkinThickness       Insulin 
##             0             5             2           192           339 
##           BMI      Pedigree           Age     Diagnosis 
##             2             0             0             0

Identify columns with fewer than 6 NAs and remove the rows where those columns are missing

# Names of the columns that have fewer than six missing values
cols_to_clean <- names(df_clean)[colSums(is.na(df_clean)) < 6]
cols_to_clean
## [1] "Pregnancies"   "Glucose"       "BloodPressure" "BMI"          
## [5] "Pedigree"      "Age"           "Diagnosis"
# Retain only the rows that are fully observed across the selected columns
df_clean <- df_clean[rowSums(is.na(df_clean[, cols_to_clean])) == 0, ]

Recheck

colSums(is.na(df_clean))
##   Pregnancies       Glucose BloodPressure SkinThickness       Insulin 
##             0             0             0           192           332 
##           BMI      Pedigree           Age     Diagnosis 
##             0             0             0             0

The only columns with NAs remaining are SkinThickness and Insulin

Reviewing rows remaining and remaining missingness

#Total number of rows
(total_rows <- nrow(df_clean))
## [1] 724
#724 rows remaining after basic cleaning
# Share of rows (in percent) with no SkinThickness measurement
skin_missing_pct <- sum(is.na(df_clean[["SkinThickness"]])) / total_rows * 100
# Share of rows (in percent) with no Insulin measurement
insulin_missing_pct <- sum(is.na(df_clean[["Insulin"]])) / total_rows * 100
# Report both percentages, rounded to two decimals
cat("Percentage of missing SkinThickness values:", round(x = skin_missing_pct, digits = 2), "%\n")
## Percentage of missing SkinThickness values: 26.52 %
cat("Percentage of missing Insulin values:", round(insulin_missing_pct, 2), "%\n")
## Percentage of missing Insulin values: 45.86 %

27% of Skin thickness values are missing. 46% of Insulin values are missing.

Checking for outliers

# Variables to screen for outliers
vars_to_plot <- c("Insulin", "BMI", "SkinThickness", "BloodPressure", "Pedigree", "Glucose", "Age")

# Lay the plots out on a 4 x 2 grid. par() returns the previous settings, which
# are kept so the graphics state can be restored once the plots are drawn.
old_par <- par(mfrow = c(4, 2), mar = c(4, 4, 2, 1))

for (var_name in vars_to_plot) {
  # One horizontal boxplot per variable. The default boxplot() method has no
  # `na.action` argument and ignores missing values itself, so none is passed.
  boxplot(df_clean[[var_name]],
          main = paste(var_name, "Boxplot"),
          horizontal = TRUE,
          col = "lightblue")
}

# Restore the graphics settings that were in effect before this section
par(old_par)

Count and examine problematic values

# BloodPressure issues (< 40; zeros were already converted to NA earlier)
# which() drops NA comparisons, so the subset can never pick up all-NA rows
# and nrow(bp_issues) always agrees with the na.rm = TRUE count below.
bp_issues <- df_clean[which(df_clean$BloodPressure < 40), ]
cat("BloodPressure < 40:", sum(df_clean$BloodPressure < 40, na.rm = TRUE), "\n")
## BloodPressure < 40: 4
cat("Total BloodPressure issues:", nrow(bp_issues), "\n\n")
## Total BloodPressure issues: 4

To remove 4 instances of blood pressure errors

# SkinThickness issues (values < 5 or > 60)
# SkinThickness still contains NAs at this point. Indexing with a logical
# vector that holds NA returns an all-NA row for each one, so which() is used
# to keep only the rows that are really out of range.
skin_issues <- df_clean[which(df_clean$SkinThickness < 5 | df_clean$SkinThickness > 60), ]
cat("SkinThickness < 5:", sum(df_clean$SkinThickness < 5, na.rm = TRUE), "\n")
## SkinThickness < 5: 0
cat("SkinThickness > 60:", sum(df_clean$SkinThickness > 60, na.rm = TRUE), "\n")
## SkinThickness > 60: 2

To remove 2 Instances of skin thickness errors

# Glucose issues (< 50)
# which() drops NA comparisons, so nrow(glucose_issues) counts only real
# low-glucose rows and is never inflated by all-NA rows.
glucose_issues <- df_clean[which(df_clean$Glucose < 50), ]
cat("Glucose < 50:", nrow(glucose_issues), "\n\n")
## Glucose < 50: 1

To remove 1 instance of glucose error

#Age
# Positions of the rows with Age > 50. They are computed once and reused for
# the inspection below; a hand-copied index list drifts out of sync with the
# data and then shows rows that do not satisfy the condition.
age_over_50 <- which(df_clean$Age > 50)
print(age_over_50)
##  [1]   8  11  12  13  22  26  28  37  41  50  63  86 108 116 122 133 140 176 193
## [20] 195 196 201 210 211 224 247 250 259 263 269 270 279 284 303 325 339 340 341
## [39] 352 365 378 429 431 432 447 451 458 460 465 468 478 481 485 486 488 503 508
## [58] 512 518 523 526 545 548 550 554 567 619 621 627 635 637 661 674 676 691 714
## [77] 716 720
print(which(df_clean$Age < 11))
## integer(0)
#Inspect rows
print(as.data.frame(df_clean[age_over_50, ]))
##     Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI Pedigree Age
## 9             2     197            70            45     543 30.5    0.158  53
## 13           10     139            80            NA      NA 27.1    1.441  57
## 14            1     189            60            23     846 30.1    0.398  59
## 15            5     166            72            19     175 25.8    0.587  51
## 24            9     119            80            35      NA 29.0    0.263  29
## 28            1      97            66            15     140 23.2    0.487  22
## 30            5     117            92            NA      NA 34.1    0.337  38
## 39            2      90            68            42      NA 38.2    0.503  27
## 43            7     106            92            18      NA 22.7    0.235  48
## 53            5      88            66            21      23 24.4    0.342  30
## 66            5      99            74            27      NA 29.0    0.203  32
## 92            4     123            80            15     176 32.0    0.443  34
## 114           4      76            62            NA      NA 34.0    0.391  25
## 122           6     111            64            39      NA 34.2    0.260  24
## 127           3     120            70            30     135 42.9    0.452  30
## 138           0      93            60            25      92 28.7    0.532  22
## 145           4     154            62            31     284 32.8    0.237  23
## 184           5      73            60            NA      NA 26.8    0.268  27
## 202           1     138            82            NA      NA 40.1    0.236  28
## 204           2      99            70            16      44 20.4    0.235  27
## 205           6     103            72            32     190 37.7    0.324  55
## 210           7     184            84            33      NA 35.5    0.355  41
## 219           5      85            74            22      NA 29.0    1.224  32
## 220           5     112            66            NA      NA 37.8    0.261  41
## 234           4     122            68            NA      NA 35.0    0.394  29
## 257           3     111            56            39      NA 30.1    0.557  30
## 260          11     155            76            28     150 33.3    1.353  51
## 272           2     108            62            32      56 25.2    0.128  21
## 276           2     100            70            52      57 40.5    0.677  25
## 282          10     129            76            28     122 35.9    0.280  39
## 283           7     133            88            15     155 32.4    0.262  37
## 292           0     107            62            30      74 36.6    0.757  25
## 297           2     146            70            38     360 28.0    0.337  29
## 317           3      99            80            11      64 19.3    0.284  30
## 341           1     130            70            13     105 25.9    0.472  22
## 359          12      88            74            40      54 35.3    0.378  48
## 360           1     196            76            36     249 36.5    0.875  29
## 361           5     189            64            33     325 31.2    0.583  29
## 373           0      84            64            22      66 35.8    0.545  21
## 386           1     119            54            13      50 22.3    0.205  24
## 399           3      82            70            NA      NA 21.1    0.389  25
## 453           0      91            68            32     210 39.9    0.381  25
## 456          14     175            62            30      NA 33.6    0.212  38
## 457           1     135            54            NA      NA 26.7    0.687  62
## 473           0     119            66            27      NA 38.8    0.259  22
## 477           2     105            80            45     191 33.7    0.711  29
## 484           0      84            82            31     125 38.2    0.233  23
## 487           1     139            62            41     480 40.7    0.536  21
## 492           2      89            90            30      NA 33.5    0.292  42
## 496           6     166            74            NA      NA 26.6    0.304  66
## 507           0     180            90            26      90 36.5    0.314  35
## 510           8     120            78            NA      NA 25.0    0.409  64
## 514           2      91            62            NA      NA 27.3    0.525  22
## 515           3      99            54            19      86 25.6    0.154  24
## 517           9     145            88            34     165 30.3    0.771  53
## 533           1      86            66            52      65 41.3    0.917  29
## 540           3     129            92            49     155 36.4    0.968  32
## 544           4      84            90            23      56 39.5    0.159  25
## 550           4     189           110            31      NA 28.5    0.680  37
## 555           1      84            64            23     115 36.9    0.471  28
## 558           8     110            76            NA      NA 27.8    0.237  58
## 579          10     133            68            NA      NA 27.0    0.245  36
## 581           0     151            90            46      NA 42.1    0.371  21
## 585           8     124            76            24     600 28.7    0.687  52
## 597           0      67            76            NA      NA 45.3    0.194  46
## 653           5     123            74            40      77 34.1    0.269  28
## 655           1     106            70            28     135 34.2    0.142  22
## 661          10     162            84            NA      NA 27.7    0.182  54
## 669           6      98            58            33     190 34.0    0.430  43
## 671           6     165            68            26     168 33.6    0.631  49
## 696           7     142            90            24     480 30.4    0.128  43
## 712           5     126            78            27      22 29.6    0.439  40
## 714           0     134            58            20     291 26.4    0.352  21
## 729           2     175            88            NA      NA 22.9    0.326  22
## 752           1     121            78            39      74 39.0    0.261  28
## 754           0     181            88            44     510 43.3    0.222  26
## 758           0     123            72            NA      NA 36.3    0.258  52
##     Diagnosis
## 9           1
## 13          0
## 14          1
## 15          1
## 24          1
## 28          0
## 30          0
## 39          1
## 43          0
## 53          0
## 66          0
## 92          0
## 114         0
## 122         0
## 127         0
## 138         0
## 145         0
## 184         0
## 202         0
## 204         0
## 205         0
## 210         1
## 219         1
## 220         1
## 234         0
## 257         0
## 260         1
## 272         0
## 276         0
## 282         0
## 283         0
## 292         1
## 297         1
## 317         0
## 341         0
## 359         0
## 360         1
## 361         1
## 373         0
## 386         0
## 399         0
## 453         0
## 456         1
## 457         0
## 473         0
## 477         1
## 484         0
## 487         0
## 492         0
## 496         0
## 507         1
## 510         0
## 514         0
## 515         0
## 517         1
## 533         0
## 540         1
## 544         0
## 550         0
## 555         0
## 558         0
## 579         0
## 581         1
## 585         1
## 597         0
## 653         0
## 655         0
## 661         0
## 669         0
## 671         0
## 696         1
## 712         0
## 714         0
## 729         0
## 752         0
## 754         1
## 758         1
# Specify the row indices to remove: every row with Age > 50.
# The positions are derived from the condition itself so they always match the
# current contents of df_clean (a hard-coded list silently removes the wrong
# rows as soon as the data changes).
rows_to_remove <- which(df_clean$Age > 50)

# Create new cleaned dataset by removing those rows.
# A logical mask is used instead of df_clean[-rows_to_remove, ] because a
# negative index built from an empty vector would select zero rows.
df_clean_2 <- df_clean[!(seq_len(nrow(df_clean)) %in% rows_to_remove), ]

# Positions (in df_clean) of the rows flagged by the checks above.
suspect_positions <- c(419, 545, 16, 58, 118, 562, 564)
print(df_clean[suspect_positions, ])
##     Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI Pedigree Age
## 446           0     180            78            63      14 59.4    2.420  25
## 580           2     197            70            99      NA 34.7    0.575  62
## 19            1     103            30            38      83 43.3    0.183  33
## 63            5      44            62            NA      NA 25.0    0.587  36
## 126           1      88            30            42      99 55.0    0.496  26
## 598           1      89            24            19      25 27.8    0.559  21
## 600           1     109            38            18     120 23.1    0.407  26
##     Diagnosis
## 446         1
## 580         1
## 19          0
## 63          0
## 126         1
## 598         0
## 600         0
# Row 419 (SkinThickness 63) looks plausible next to its BMI (59.4) - keep

# Remove rows 545, 16, 58, 118, 562, 564 (positions in df_clean)
rows_to_remove <- c(545, 16, 58, 118, 562, 564)

# df_clean_2 has already lost rows, so these positions no longer point at the
# same records there. Translate them to row names, which survive subsetting,
# and drop the matching records from df_clean_2.
row_ids_to_remove <- rownames(df_clean)[rows_to_remove]

# Create new dataframe without these rows
df_clean_2 <- df_clean_2[!(rownames(df_clean_2) %in% row_ids_to_remove), ]

Concise removal of non-realistic values

# Rebuild df_clean_2 directly from df_clean, keeping only rows whose values lie
# inside the accepted ranges. This replaces whatever df_clean_2 held before.
# The filter is applied once (it was previously computed twice with identical
# conditions); dplyr::filter is namespaced because stats::filter is masked.
#
# filter() keeps a row only when every condition is TRUE, so a row with NA in
# any of these columns is dropped as well.
# NOTE(review): this discards every row with a missing SkinThickness, leaving
# Insulin as the only column with NAs to impute - confirm this is intended.
# NOTE(review): SkinThickness <= 60 also drops the record with SkinThickness 63
# that the manual inspection marked as "keep" - confirm which rule should win.
df_clean_2 <- df_clean %>%
  dplyr::filter(Age >= 11 & Age <= 50,
                Glucose >= 50,
                SkinThickness >= 5 & SkinThickness <= 60,
                BloodPressure >= 40)

Check

# Per-column summary of the filtered data, to confirm the value ranges and see
# which columns still contain NAs.
summary(df_clean_2)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   : 56.0   Min.   : 40.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 97.0   1st Qu.: 64.00   1st Qu.:21.25  
##  Median : 2.000   Median :112.5   Median : 70.00   Median :29.00  
##  Mean   : 3.294   Mean   :118.9   Mean   : 71.12   Mean   :28.97  
##  3rd Qu.: 5.000   3rd Qu.:137.0   3rd Qu.: 78.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :110.00   Max.   :60.00  
##                                                                   
##     Insulin           BMI           Pedigree           Age      
##  Min.   : 15.0   Min.   :18.20   Min.   :0.0850   Min.   :21.0  
##  1st Qu.: 75.5   1st Qu.:27.82   1st Qu.:0.2580   1st Qu.:23.0  
##  Median :120.0   Median :32.80   Median :0.4055   Median :27.0  
##  Mean   :149.2   Mean   :32.82   Mean   :0.4938   Mean   :29.5  
##  3rd Qu.:182.0   3rd Qu.:36.88   3rd Qu.:0.6535   3rd Qu.:35.0  
##  Max.   :744.0   Max.   :67.10   Max.   :2.3290   Max.   :50.0  
##  NA's   :127                                                    
##    Diagnosis     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3086  
##  3rd Qu.:1.0000  
##  Max.   :1.0000  
## 
# Show the number of remaining rows and the type of each column.
str(df_clean_2)
## 'data.frame':    486 obs. of  9 variables:
##  $ Pregnancies  : int  6 1 1 0 3 0 1 3 9 10 ...
##  $ Glucose      : int  148 85 89 137 78 118 115 126 119 125 ...
##  $ BloodPressure: int  72 66 66 40 50 84 70 88 80 70 ...
##  $ SkinThickness: int  35 29 23 35 32 47 30 41 35 26 ...
##  $ Insulin      : int  NA NA 94 168 88 230 96 235 NA 115 ...
##  $ BMI          : num  33.6 26.6 28.1 43.1 31 45.8 34.6 39.3 29 31.1 ...
##  $ Pedigree     : num  0.627 0.351 0.167 2.288 0.248 ...
##  $ Age          : int  50 31 21 33 26 31 32 27 29 41 ...
##  $ Diagnosis    : int  1 0 0 1 1 1 1 0 1 1 ...

IMPUTATION.

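We will create two versions of the dataset: one with the missing values imputed by random forest (missForest) and one imputed by MICE. Insulin and SkinThickness were the variables with missing values; after the filtering above only Insulin still contains NAs, so Insulin is the only variable actually imputed below.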

Mice Imputation

# Input shared by both imputation approaches (MICE here, random forest later).
data_to_be_imputed <- df_clean_2

# Multiple imputation with predictive mean matching: m = 5 completed datasets,
# fixed seed so the imputations are reproducible.
imputed_data <- mice(data_to_be_imputed, method = "pmm", m = 5, seed = 123)
## 
##  iter imp variable
##   1   1  Insulin
##   1   2  Insulin
##   1   3  Insulin
##   1   4  Insulin
##   1   5  Insulin
##   2   1  Insulin
##   2   2  Insulin
##   2   3  Insulin
##   2   4  Insulin
##   2   5  Insulin
##   3   1  Insulin
##   3   2  Insulin
##   3   3  Insulin
##   3   4  Insulin
##   3   5  Insulin
##   4   1  Insulin
##   4   2  Insulin
##   4   3  Insulin
##   4   4  Insulin
##   4   5  Insulin
##   5   1  Insulin
##   5   2  Insulin
##   5   3  Insulin
##   5   4  Insulin
##   5   5  Insulin
# Take the first of the completed datasets produced by mice().
imputed_df <- mice::complete(imputed_data, action = 1L)

# Number of missing values left in each column
colSums(is.na(imputed_df))
##   Pregnancies       Glucose BloodPressure SkinThickness       Insulin 
##             0             0             0             0             0 
##           BMI      Pedigree           Age     Diagnosis 
##             0             0             0             0
# Per-column summary of the MICE-completed data, for comparison with the
# summary of df_clean_2 before imputation.
summary(imputed_df)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   : 56.0   Min.   : 40.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 97.0   1st Qu.: 64.00   1st Qu.:21.25  
##  Median : 2.000   Median :112.5   Median : 70.00   Median :29.00  
##  Mean   : 3.294   Mean   :118.9   Mean   : 71.12   Mean   :28.97  
##  3rd Qu.: 5.000   3rd Qu.:137.0   3rd Qu.: 78.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :110.00   Max.   :60.00  
##     Insulin           BMI           Pedigree           Age      
##  Min.   : 15.0   Min.   :18.20   Min.   :0.0850   Min.   :21.0  
##  1st Qu.: 75.0   1st Qu.:27.82   1st Qu.:0.2580   1st Qu.:23.0  
##  Median :115.0   Median :32.80   Median :0.4055   Median :27.0  
##  Mean   :146.2   Mean   :32.82   Mean   :0.4938   Mean   :29.5  
##  3rd Qu.:180.0   3rd Qu.:36.88   3rd Qu.:0.6535   3rd Qu.:35.0  
##  Max.   :744.0   Max.   :67.10   Max.   :2.3290   Max.   :50.0  
##    Diagnosis     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3086  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
head(imputed_df)
# Final MICE-imputed dataset.
# Stored under its own name: it was previously assigned to rf_dataset_imputed,
# which is the name the random-forest imputation uses for its result.
mice_dataset_imputed <- imputed_df

# NOTE(review): absolute, user-specific output path - the script only runs
# unchanged on the author's machine.
write.csv(mice_dataset_imputed, "/Users/arnenyecknyeck/Desktop/Statistical-Inference-Package/completed_data_mice.csv", row.names = FALSE)

RF Imputation

# Fix the RNG state so the random-forest imputation is reproducible.
set.seed(123)

# Keep the outcome aside; it is not used as a predictor during imputation.
outcome_var <- data_to_be_imputed[["Diagnosis"]]

# All columns except the outcome, in their original order.
feature_cols <- setdiff(names(data_to_be_imputed), "Diagnosis")
df_features_only <- data_to_be_imputed[, feature_cols]

# Both variables of interest must still be present among the features.
stopifnot(
  "Insulin" %in% colnames(df_features_only),
  "SkinThickness" %in% colnames(df_features_only)
)

# Random-forest imputation on the feature columns only.
rf_imputed <- missForest(df_features_only, maxiter = 10, ntree = 100)

# ximp holds the completed feature table; re-attach the outcome to it.
rf_dataset_imputed <- rf_imputed$ximp
rf_dataset_imputed[["Diagnosis"]] <- outcome_var

# Number of missing values left in each column
colSums(is.na(rf_dataset_imputed))
##   Pregnancies       Glucose BloodPressure SkinThickness       Insulin 
##             0             0             0             0             0 
##           BMI      Pedigree           Age     Diagnosis 
##             0             0             0             0
# Per-column summary of the random-forest-completed data, for comparison with
# the MICE result and with df_clean_2 before imputation.
summary(rf_dataset_imputed)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   : 56.0   Min.   : 40.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 97.0   1st Qu.: 64.00   1st Qu.:21.25  
##  Median : 2.000   Median :112.5   Median : 70.00   Median :29.00  
##  Mean   : 3.294   Mean   :118.9   Mean   : 71.12   Mean   :28.97  
##  3rd Qu.: 5.000   3rd Qu.:137.0   3rd Qu.: 78.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :110.00   Max.   :60.00  
##     Insulin            BMI           Pedigree           Age      
##  Min.   : 15.00   Min.   :18.20   Min.   :0.0850   Min.   :21.0  
##  1st Qu.: 84.66   1st Qu.:27.82   1st Qu.:0.2580   1st Qu.:23.0  
##  Median :120.00   Median :32.80   Median :0.4055   Median :27.0  
##  Mean   :147.05   Mean   :32.82   Mean   :0.4938   Mean   :29.5  
##  3rd Qu.:180.00   3rd Qu.:36.88   3rd Qu.:0.6535   3rd Qu.:35.0  
##  Max.   :744.00   Max.   :67.10   Max.   :2.3290   Max.   :50.0  
##    Diagnosis     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3086  
##  3rd Qu.:1.0000  
##  Max.   :1.0000
# Preview the first rows, then save the random-forest-imputed data as CSV
# without a row-name column.
# NOTE(review): absolute, user-specific output path - the script only runs
# unchanged on the author's machine.
head(rf_dataset_imputed)
write.csv(rf_dataset_imputed, "/Users/arnenyecknyeck/Desktop/Statistical-Inference-Package/completed_data_rf.csv", row.names = FALSE)